library(glmnet)
library(tidyverse)
library(magrittr)
library(pROC)
library(ROCR)
library(MASS)
library(ROSE)
library(caret)
library(leaps)
library(sjPlot)
library(sjlabelled)
library(sjmisc)
#read in full data set
df_simple = read.csv('./bank-full.csv')

#Converting to Factors
df_simple$y <- factor(df_simple$y)
df_simple$job = factor(df_simple$job)
df_simple$marital = factor(df_simple$marital)
df_simple$education = factor(df_simple$education)
df_simple$default = factor(df_simple$default)
df_simple$housing = factor(df_simple$housing)
df_simple$loan = factor(df_simple$loan)
df_simple$contact = factor(df_simple$contact)
df_simple$poutcome = factor(df_simple$poutcome)
df_simple$month = factor(df_simple$month)

#create test train split
set.seed(1)
simple_test_index <- sample(1:nrow(df_simple), size = 0.2*nrow(df_simple))
test_simple <- df_simple[simple_test_index,]
train_simple<- df_simple[-simple_test_index,]
#using clean data set from GitHub
df = read.csv('./clean.csv')
#head(df)

#converting to factors
df$y = as.factor(df$y)
df$job = factor(df$job)
df$marital = factor(df$marital)
df$education = factor(df$education)
df$default = factor(df$default)
df$housing = factor(df$housing)
df$loan = factor(df$loan)
df$contact = factor(df$contact)
df$poutcome = factor(df$poutcome)
df$month = factor(df$month)
#create test train split
set.seed(1)

#balance train data set 

test_index <- sample(1:nrow(df), size = 0.2*nrow(df))
test <- df[test_index,]
train<- df[-test_index,]

both_train <- ovun.sample(y~., data=train, method="both")
both_train <- both_train$data
table(both_train$y)
## 
##    no   yes 
## 14888 15062
#defining response variable

train_resp = train$y
test_resp = as.factor(test$y)

#defining explanatory variables
train_expl = data.matrix(train[,1:16])
test_expl = data.matrix(test[,1:16])
#unclean data proportions
writeLines("Proportion of No to Yes in Orignial Data")
## Proportion of No to Yes in Orignial Data
prop.table(table(df_simple$y))
## 
##        no       yes 
## 0.8830152 0.1169848
writeLines('')
writeLines("Proportion of No to Yes in Test Split from Original Data")
## Proportion of No to Yes in Test Split from Original Data
prop.table(table(test_simple$y))
## 
##        no       yes 
## 0.8786773 0.1213227
#clean data proporation
writeLines("Proportion of No to Yes in Clean/Transformed Data")
## Proportion of No to Yes in Clean/Transformed Data
prop.table(table(df$y))
## 
##        no       yes 
## 0.8730133 0.1269867
writeLines('')
writeLines("Proportion of No to Yes in Clean/Transformed Data with Balancing")
## Proportion of No to Yes in Clean/Transformed Data with Balancing
prop.table(table(both_train$y))
## 
##        no       yes 
## 0.4970952 0.5029048
#create simple model
slr = glm(y ~ ., data = train_simple, family = "binomial")
summary(slr)
## 
## Call:
## glm(formula = y ~ ., family = "binomial", data = train_simple)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.9282  -0.3754  -0.2550  -0.1523   3.4323  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -2.508e+00  2.046e-01 -12.258  < 2e-16 ***
## age                 2.548e-04  2.464e-03   0.103 0.917646    
## jobblue-collar     -3.331e-01  8.091e-02  -4.116 3.85e-05 ***
## jobentrepreneur    -4.834e-01  1.438e-01  -3.362 0.000774 ***
## jobhousemaid       -6.201e-01  1.544e-01  -4.016 5.92e-05 ***
## jobmanagement      -1.701e-01  8.177e-02  -2.081 0.037450 *  
## jobretired          1.751e-01  1.092e-01   1.603 0.108850    
## jobself-employed   -3.496e-01  1.259e-01  -2.776 0.005497 ** 
## jobservices        -2.952e-01  9.446e-02  -3.125 0.001779 ** 
## jobstudent          3.225e-01  1.234e-01   2.614 0.008956 ** 
## jobtechnician      -2.180e-01  7.698e-02  -2.832 0.004621 ** 
## jobunemployed      -1.770e-01  1.244e-01  -1.424 0.154578    
## jobunknown         -5.907e-02  2.434e-01  -0.243 0.808243    
## maritalmarried     -1.522e-01  6.624e-02  -2.298 0.021541 *  
## maritalsingle       1.316e-01  7.549e-02   1.744 0.081211 .  
## educationsecondary  1.217e-01  7.138e-02   1.705 0.088176 .  
## educationtertiary   2.990e-01  8.339e-02   3.585 0.000337 ***
## educationunknown    2.032e-01  1.155e-01   1.759 0.078631 .  
## defaultyes         -1.062e-01  1.839e-01  -0.578 0.563480    
## balance             1.153e-05  5.629e-06   2.048 0.040585 *  
## housingyes         -6.549e-01  4.901e-02 -13.363  < 2e-16 ***
## loanyes            -3.440e-01  6.567e-02  -5.239 1.62e-07 ***
## contacttelephone   -2.112e-01  8.460e-02  -2.497 0.012540 *  
## contactunknown     -1.592e+00  8.137e-02 -19.560  < 2e-16 ***
## day                 9.878e-03  2.796e-03   3.532 0.000412 ***
## monthaug           -6.290e-01  8.786e-02  -7.159 8.13e-13 ***
## monthdec            7.345e-01  1.976e-01   3.717 0.000202 ***
## monthfeb           -9.934e-02  1.000e-01  -0.993 0.320628    
## monthjan           -1.222e+00  1.360e-01  -8.984  < 2e-16 ***
## monthjul           -7.983e-01  8.712e-02  -9.163  < 2e-16 ***
## monthjun            4.926e-01  1.042e-01   4.727 2.27e-06 ***
## monthmar            1.606e+00  1.348e-01  11.914  < 2e-16 ***
## monthmay           -3.971e-01  8.105e-02  -4.900 9.58e-07 ***
## monthnov           -8.067e-01  9.386e-02  -8.595  < 2e-16 ***
## monthoct            9.358e-01  1.217e-01   7.687 1.50e-14 ***
## monthsep            7.963e-01  1.371e-01   5.810 6.25e-09 ***
## duration            4.225e-03  7.285e-05  58.000  < 2e-16 ***
## campaign           -9.061e-02  1.124e-02  -8.060 7.62e-16 ***
## pdays              -1.952e-04  3.514e-04  -0.555 0.578576    
## previous            8.261e-03  6.394e-03   1.292 0.196344    
## poutcomeother       2.337e-01  1.002e-01   2.333 0.019648 *  
## poutcomesuccess     2.278e+00  9.230e-02  24.678  < 2e-16 ***
## poutcomeunknown    -1.185e-01  1.049e-01  -1.130 0.258514    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25946  on 36168  degrees of freedom
## Residual deviance: 17268  on 36126  degrees of freedom
## AIC: 17354
## 
## Number of Fisher Scoring iterations: 6
#CI and Odds Ratio
exp(cbind(OR = coef(slr), confint(slr)))
##                            OR      2.5 %     97.5 %
## (Intercept)        0.08146003 0.05449355  0.1215207
## age                1.00025483 0.99542991  1.0050924
## jobblue-collar     0.71671807 0.61164316  0.8399795
## jobentrepreneur    0.61668846 0.46283882  0.8135720
## jobhousemaid       0.53787135 0.39522358  0.7242655
## jobmanagement      0.84354388 0.71888612  0.9905529
## jobretired         1.19135111 0.96140278  1.4751354
## jobself-employed   0.70496348 0.54926518  0.8999904
## jobservices        0.74438926 0.61797162  0.8950207
## jobstudent         1.38061574 1.08275410  1.7565309
## jobtechnician      0.80410087 0.69161212  0.9352702
## jobunemployed      0.83775803 0.65487550  1.0664679
## jobunknown         0.94263760 0.57586557  1.4977698
## maritalmarried     0.85877370 0.75486413  0.9787217
## maritalsingle      1.14069056 0.98442411  1.3234801
## educationsecondary 1.12943593 0.98267036  1.3000297
## educationtertiary  1.34846024 1.14571646  1.5887769
## educationunknown   1.22527426 0.97543847  1.5343390
## defaultyes         0.89923040 0.61811996  1.2726620
## balance            1.00001153 1.00000029  1.0000224
## housingyes         0.51948490 0.47181741  0.5717653
## loanyes            0.70889846 0.62251575  0.8053401
## contacttelephone   0.80960549 0.68466945  0.9539734
## contactunknown     0.20361164 0.17343521  0.2386078
## day                1.00992653 1.00440654  1.0154775
## monthaug           0.53311478 0.44884076  0.6334263
## monthdec           2.08445012 1.41218540  3.0662383
## monthfeb           0.90543540 0.74402152  1.1012781
## monthjan           0.29470002 0.22488150  0.3833568
## monthjul           0.45009364 0.37943073  0.5339145
## monthjun           1.63650295 1.33432500  2.0075304
## monthmar           4.98074485 3.82445399  6.4874834
## monthmay           0.67223651 0.57368770  0.7882740
## monthnov           0.44630825 0.37110619  0.5361949
## monthoct           2.54917636 2.00725797  3.2350693
## monthsep           2.21735747 1.69391487  2.8992936
## duration           1.00423441 1.00409188  1.0043787
## campaign           0.91337835 0.89315095  0.9333850
## pdays              0.99980482 0.99911377  1.0004913
## previous           1.00829532 0.99502452  1.0235788
## poutcomeother      1.26327446 1.03693016  1.5358046
## poutcomesuccess    9.75413575 8.14662209 11.6983358
## poutcomeunknown    0.88823582 0.72384161  1.0921812
#predictions
slrPredict = predict(slr,newdata = test_simple,type = 'response')


plot(slrPredict,
     col = test_simple$y,
     main = 'Simple Log Regression Predictions',
     ylab = 'Prediction',
     xlab = 'Data Index')

slrPredictnew<-factor(ifelse(as.numeric(slrPredict)>.5,'yes','no' ) ,levels = c('no','yes'))
confusionMatrix(slrPredictnew,test_simple$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7751  679
##        yes  194  418
##                                           
##                Accuracy : 0.9035          
##                  95% CI : (0.8972, 0.9095)
##     No Information Rate : 0.8787          
##     P-Value [Acc > NIR] : 5.575e-14       
##                                           
##                   Kappa : 0.4406          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9756          
##             Specificity : 0.3810          
##          Pos Pred Value : 0.9195          
##          Neg Pred Value : 0.6830          
##              Prevalence : 0.8787          
##          Detection Rate : 0.8572          
##    Detection Prevalence : 0.9323          
##       Balanced Accuracy : 0.6783          
##                                           
##        'Positive' Class : no              
## 
#Pearson Chi^2 Test
sum(residuals(slr, type = "pearson")^2)
## [1] 411212.5
#Plotting Odds Ratios
plot_model(slr,sort.est = TRUE,grid = TRUE,title = 'Odds Ratio for Explanitory Variables: SLR')

#create simple model
slr_clean = glm(y ~ ., data = train, family = "binomial")
summary(slr_clean)
## 
## Call:
## glm(formula = y ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9889  -0.3955  -0.2112  -0.0950   4.0799  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -1.171e+01  2.972e-01 -39.393  < 2e-16 ***
## age                -1.743e-03  2.638e-03  -0.661  0.50878    
## jobblue-collar     -3.700e-01  8.781e-02  -4.213 2.52e-05 ***
## jobentrepreneur    -4.320e-01  1.542e-01  -2.801  0.00509 ** 
## jobhousemaid       -4.071e-01  1.592e-01  -2.557  0.01057 *  
## jobmanagement      -1.290e-01  8.895e-02  -1.451  0.14685    
## jobretired          1.694e-01  1.169e-01   1.448  0.14758    
## jobself-employed   -3.460e-01  1.320e-01  -2.621  0.00878 ** 
## jobservices        -2.932e-01  1.021e-01  -2.871  0.00409 ** 
## jobstudent          2.784e-01  1.329e-01   2.095  0.03620 *  
## jobtechnician      -1.641e-01  8.340e-02  -1.967  0.04914 *  
## jobunemployed      -1.913e-01  1.316e-01  -1.454  0.14604    
## jobunknown         -2.584e-01  2.809e-01  -0.920  0.35768    
## maritalmarried     -2.053e-01  7.206e-02  -2.849  0.00439 ** 
## maritalsingle       5.425e-02  8.176e-02   0.663  0.50703    
## educationsecondary  1.197e-01  7.645e-02   1.566  0.11733    
## educationtertiary   2.717e-01  8.908e-02   3.050  0.00229 ** 
## educationunknown    1.485e-01  1.235e-01   1.203  0.22911    
## defaultyes         -5.871e-03  3.221e-01  -0.018  0.98546    
## balance             5.783e-02  1.441e-02   4.012 6.01e-05 ***
## housingyes         -6.791e-01  5.238e-02 -12.965  < 2e-16 ***
## loanyes            -3.749e-01  7.390e-02  -5.073 3.91e-07 ***
## contacttelephone   -1.000e-01  9.220e-02  -1.085  0.27802    
## contactunknown     -1.572e+00  8.583e-02 -18.312  < 2e-16 ***
## day                 1.305e-02  3.022e-03   4.319 1.57e-05 ***
## monthaug           -5.113e-01  9.453e-02  -5.408 6.36e-08 ***
## monthdec            8.351e-01  2.083e-01   4.009 6.10e-05 ***
## monthfeb            2.223e-02  1.073e-01   0.207  0.83589    
## monthjan           -1.146e+00  1.431e-01  -8.011 1.14e-15 ***
## monthjul           -7.486e-01  9.393e-02  -7.970 1.58e-15 ***
## monthjun            6.665e-01  1.123e-01   5.936 2.92e-09 ***
## monthmar            1.898e+00  1.489e-01  12.749  < 2e-16 ***
## monthmay           -2.234e-01  8.674e-02  -2.575  0.01002 *  
## monthnov           -7.434e-01  9.954e-02  -7.469 8.08e-14 ***
## monthoct            1.009e+00  1.335e-01   7.558 4.09e-14 ***
## monthsep            1.009e+00  1.504e-01   6.710 1.94e-11 ***
## duration            1.823e+00  3.343e-02  54.536  < 2e-16 ***
## campaign           -6.605e-02  1.218e-02  -5.423 5.86e-08 ***
## pdays              -3.582e-04  3.724e-04  -0.962  0.33624    
## previous            7.397e-03  6.701e-03   1.104  0.26964    
## poutcomeother       2.190e-01  1.098e-01   1.995  0.04605 *  
## poutcomesuccess     2.373e+00  1.004e-01  23.636  < 2e-16 ***
## poutcomeunknown    -7.274e-02  1.118e-01  -0.651  0.51528    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22686  on 29949  degrees of freedom
## Residual deviance: 14468  on 29907  degrees of freedom
## AIC: 14554
## 
## Number of Fisher Scoring iterations: 6
#CI and Odds Ratio
exp(cbind(OR = coef(slr_clean), confint(slr_clean)))
##                              OR        2.5 %       97.5 %
## (Intercept)        8.238035e-06 4.586338e-06 1.470443e-05
## age                9.982586e-01 9.931047e-01 1.003428e+00
## jobblue-collar     6.907386e-01 5.815715e-01 8.206011e-01
## jobentrepreneur    6.492240e-01 4.774057e-01 8.742601e-01
## jobhousemaid       6.655822e-01 4.847676e-01 9.053471e-01
## jobmanagement      8.789321e-01 7.385848e-01 1.046786e+00
## jobretired         1.184535e+00 9.415751e-01 1.489288e+00
## jobself-employed   7.075114e-01 5.448579e-01 9.144445e-01
## jobservices        7.458983e-01 6.100351e-01 9.104335e-01
## jobstudent         1.321072e+00 1.017118e+00 1.712901e+00
## jobtechnician      8.486758e-01 7.208675e-01 9.996945e-01
## jobunemployed      8.258990e-01 6.369023e-01 1.067015e+00
## jobunknown         7.722928e-01 4.370054e-01 1.317924e+00
## maritalmarried     8.144410e-01 7.077684e-01 9.388294e-01
## maritalsingle      1.055745e+00 8.999763e-01 1.240091e+00
## educationsecondary 1.127181e+00 9.710105e-01 1.310370e+00
## educationtertiary  1.312200e+00 1.102556e+00 1.563396e+00
## educationunknown   1.160081e+00 9.092406e-01 1.475521e+00
## defaultyes         9.941458e-01 5.066015e-01 1.805444e+00
## balance            1.059530e+00 1.030087e+00 1.089963e+00
## housingyes         5.070777e-01 4.575137e-01 5.618063e-01
## loanyes            6.873349e-01 5.937521e-01 7.933379e-01
## contacttelephone   9.048262e-01 7.540019e-01 1.082362e+00
## contactunknown     2.076993e-01 1.753722e-01 2.455365e-01
## day                1.013138e+00 1.007156e+00 1.019160e+00
## monthaug           5.997309e-01 4.983613e-01 7.219516e-01
## monthdec           2.304960e+00 1.531320e+00 3.467470e+00
## monthfeb           1.022479e+00 8.283444e-01 1.261651e+00
## monthjan           3.178169e-01 2.392228e-01 4.193238e-01
## monthjul           4.730108e-01 3.934298e-01 5.685986e-01
## monthjun           1.947429e+00 1.563029e+00 2.427478e+00
## monthmar           6.672033e+00 4.985704e+00 8.938227e+00
## monthmay           7.998143e-01 6.749911e-01 9.484193e-01
## monthnov           4.754843e-01 3.910070e-01 5.776678e-01
## monthoct           2.743465e+00 2.111369e+00 3.564041e+00
## monthsep           2.743562e+00 2.043306e+00 3.685271e+00
## duration           6.191233e+00 5.801204e+00 6.613618e+00
## campaign           9.360862e-01 9.136403e-01 9.583175e-01
## pdays              9.996419e-01 9.989099e-01 1.000370e+00
## previous           1.007424e+00 9.936424e-01 1.024152e+00
## poutcomeother      1.244848e+00 1.002730e+00 1.542196e+00
## poutcomesuccess    1.072602e+01 8.818966e+00 1.307187e+01
## poutcomeunknown    9.298421e-01 7.475710e-01 1.158990e+00
#predictions
slrPredict_clean = predict(slr_clean,newdata = test,type = 'response')


plot(slrPredict_clean,
     col = test$y,
     main = 'Simple Log Regression Predictions',
     ylab = 'Prediction',
     xlab = 'Data Index')

slrPredictnew_clean<-factor(ifelse(as.numeric(slrPredict_clean)>.5,'yes','no' ) ,levels = c('no','yes'))
confusionMatrix(slrPredictnew_clean,test$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  6332  621
##        yes  175  359
##                                           
##                Accuracy : 0.8937          
##                  95% CI : (0.8865, 0.9006)
##     No Information Rate : 0.8691          
##     P-Value [Acc > NIR] : 5.054e-11       
##                                           
##                   Kappa : 0.4208          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9731          
##             Specificity : 0.3663          
##          Pos Pred Value : 0.9107          
##          Neg Pred Value : 0.6723          
##              Prevalence : 0.8691          
##          Detection Rate : 0.8457          
##    Detection Prevalence : 0.9287          
##       Balanced Accuracy : 0.6697          
##                                           
##        'Positive' Class : no              
## 
#Pearson Chi^2 Test
sum(residuals(slr_clean, type = "pearson")^2)
## [1] 37392.8
#Plotting Odds Ratios
plot_model(slr_clean,sort.est = TRUE,grid = TRUE,title = 'Odds Ratio for Explanitory Variables: SLR Clean')

# use full model for stepwise feature selection
StepwiseFeature =step(slr_clean,direction="both",trace=0)
summary(StepwiseFeature)
## 
## Call:
## glm(formula = y ~ job + marital + education + balance + housing + 
##     loan + contact + day + month + duration + campaign + poutcome, 
##     family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9765  -0.3961  -0.2115  -0.0953   4.0865  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -11.845378   0.259682 -45.615  < 2e-16 ***
## jobblue-collar      -0.369623   0.087719  -4.214 2.51e-05 ***
## jobentrepreneur     -0.433791   0.154128  -2.814  0.00489 ** 
## jobhousemaid        -0.414508   0.158900  -2.609  0.00909 ** 
## jobmanagement       -0.129444   0.088908  -1.456  0.14541    
## jobretired           0.138398   0.105632   1.310  0.19013    
## jobself-employed    -0.344286   0.131964  -2.609  0.00908 ** 
## jobservices         -0.292437   0.102061  -2.865  0.00417 ** 
## jobstudent           0.297777   0.130611   2.280  0.02261 *  
## jobtechnician       -0.162431   0.083367  -1.948  0.05137 .  
## jobunemployed       -0.192938   0.131591  -1.466  0.14260    
## jobunknown          -0.264413   0.280575  -0.942  0.34599    
## maritalmarried      -0.199212   0.071755  -2.776  0.00550 ** 
## maritalsingle        0.074076   0.076957   0.963  0.33577    
## educationsecondary   0.124322   0.075974   1.636  0.10176    
## educationtertiary    0.280224   0.088256   3.175  0.00150 ** 
## educationunknown     0.148184   0.123472   1.200  0.23009    
## balance              0.057168   0.014266   4.007 6.14e-05 ***
## housingyes          -0.678794   0.051923 -13.073  < 2e-16 ***
## loanyes             -0.373466   0.073838  -5.058 4.24e-07 ***
## contacttelephone    -0.110210   0.091217  -1.208  0.22696    
## contactunknown      -1.568797   0.085728 -18.300  < 2e-16 ***
## day                  0.013005   0.003022   4.304 1.68e-05 ***
## monthaug            -0.509909   0.094380  -5.403 6.56e-08 ***
## monthdec             0.837434   0.208387   4.019 5.85e-05 ***
## monthfeb             0.028521   0.107168   0.266  0.79014    
## monthjan            -1.138658   0.142940  -7.966 1.64e-15 ***
## monthjul            -0.745257   0.093886  -7.938 2.06e-15 ***
## monthjun             0.668745   0.112267   5.957 2.57e-09 ***
## monthmar             1.899374   0.148830  12.762  < 2e-16 ***
## monthmay            -0.224068   0.086669  -2.585  0.00973 ** 
## monthnov            -0.733985   0.098962  -7.417 1.20e-13 ***
## monthoct             1.010089   0.133336   7.576 3.58e-14 ***
## monthsep             1.008682   0.150363   6.708 1.97e-11 ***
## duration             1.822089   0.033407  54.542  < 2e-16 ***
## campaign            -0.065978   0.012176  -5.419 6.01e-08 ***
## poutcomeother        0.236366   0.109131   2.166  0.03032 *  
## poutcomesuccess      2.395740   0.097635  24.538  < 2e-16 ***
## poutcomeunknown     -0.010416   0.069424  -0.150  0.88074    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22686  on 29949  degrees of freedom
## Residual deviance: 14471  on 29911  degrees of freedom
## AIC: 14549
## 
## Number of Fisher Scoring iterations: 6
#stepwise feature selection model
slr_fsr = glm(y ~ job+marital+education+balance+housing+loan+contact+day+month+duration+campaign+pdays+poutcome, family = 'binomial',data = train)

#predictions from stepwise output 
slrPredict_fsr = predict(slr_fsr,newdata = test,type = 'response')

plot(slrPredict_fsr,
     col = test$y,
     main = 'Simple Log Regression Predictions: Stepwise',
     ylab = 'Prediction',
     xlab = 'Data Index')

slrPredictnew_fsr<-factor(ifelse(as.numeric(slrPredict_fsr)>.5,'yes','no' ) ,levels = c('no','yes'))
confusionMatrix(slrPredictnew_fsr,test$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  6331  623
##        yes  176  357
##                                           
##                Accuracy : 0.8933          
##                  95% CI : (0.8861, 0.9002)
##     No Information Rate : 0.8691          
##     P-Value [Acc > NIR] : 1.033e-10       
##                                           
##                   Kappa : 0.4183          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9730          
##             Specificity : 0.3643          
##          Pos Pred Value : 0.9104          
##          Neg Pred Value : 0.6698          
##              Prevalence : 0.8691          
##          Detection Rate : 0.8456          
##    Detection Prevalence : 0.9288          
##       Balanced Accuracy : 0.6686          
##                                           
##        'Positive' Class : no              
## 
#Pearson Chi^2 Test
sum(residuals(slr_fsr, type = "pearson")^2)
## [1] 37457.62
#CI and Odds Ratio
exp(cbind(OR = coef(slr_fsr), confint(slr_fsr)))
##                              OR        2.5 %       97.5 %
## (Intercept)        7.804253e-06 4.554068e-06 1.328962e-05
## jobblue-collar     6.913510e-01 5.821975e-01 8.211762e-01
## jobentrepreneur    6.472094e-01 4.759785e-01 8.714525e-01
## jobhousemaid       6.595975e-01 4.807179e-01 8.966372e-01
## jobmanagement      8.783147e-01 7.381404e-01 1.045948e+00
## jobretired         1.145135e+00 9.305747e-01 1.408334e+00
## jobself-employed   7.077094e-01 5.450623e-01 9.146071e-01
## jobservices        7.469701e-01 6.109913e-01 9.116240e-01
## jobstudent         1.343348e+00 1.038891e+00 1.733977e+00
## jobtechnician      8.491104e-01 7.212735e-01 1.000156e+00
## jobunemployed      8.245130e-01 6.358364e-01 1.065218e+00
## jobunknown         7.661636e-01 4.337011e-01 1.306873e+00
## maritalmarried     8.182866e-01 7.115278e-01 9.427170e-01
## maritalsingle      1.075195e+00 9.252878e-01 1.251220e+00
## educationsecondary 1.132777e+00 9.767253e-01 1.315680e+00
## educationtertiary  1.321804e+00 1.112378e+00 1.572376e+00
## educationunknown   1.159672e+00 9.089180e-01 1.475005e+00
## balance            1.058503e+00 1.029378e+00 1.088601e+00
## housingyes         5.093871e-01 4.598629e-01 5.640388e-01
## loanyes            6.883000e-01 5.946752e-01 7.943313e-01
## contacttelephone   8.971375e-01 7.490108e-01 1.071102e+00
## contactunknown     2.074275e-01 1.751554e-01 2.451975e-01
## day                1.013077e+00 1.007097e+00 1.019096e+00
## monthaug           5.987972e-01 4.976896e-01 7.206763e-01
## monthdec           2.305990e+00 1.532071e+00 3.468801e+00
## monthfeb           1.024354e+00 8.299547e-01 1.263833e+00
## monthjan           3.184947e-01 2.397521e-01 4.201825e-01
## monthjul           4.736439e-01 3.939787e-01 5.693281e-01
## monthjun           1.948718e+00 1.564093e+00 2.429038e+00
## monthmar           6.669112e+00 4.983695e+00 8.934010e+00
## monthmay           8.009444e-01 6.759731e-01 9.497209e-01
## monthnov           4.752064e-01 3.908233e-01 5.772624e-01
## monthoct           2.737726e+00 2.107409e+00 3.555813e+00
## monthsep           2.743455e+00 2.043301e+00 3.684981e+00
## duration           6.189700e+00 5.799857e+00 6.611876e+00
## campaign           9.362348e-01 9.137952e-01 9.584601e-01
## pdays              9.996322e-01 9.989000e-01 1.000360e+00
## poutcomeother      1.260022e+00 1.015951e+00 1.559498e+00
## poutcomesuccess    1.072117e+01 8.815222e+00 1.306562e+01
## poutcomeunknown    9.095883e-01 7.340954e-01 1.129013e+00
#Plotting Odds Ratio
plot_model(slr_fsr,sort.est = TRUE,grid = TRUE,title = 'Odds Ratio for Explanitory Variables:Stepwise LR')

anova(slr_fsr,slr_clean, test = "Chisq",class(slr_clean))
## Analysis of Deviance Table
## 
## Model 1: y ~ job + marital + education + balance + housing + loan + contact + 
##     day + month + duration + campaign + pdays + poutcome
## Model 2: y ~ age + job + marital + education + default + balance + housing + 
##     loan + contact + day + month + duration + campaign + pdays + 
##     previous + poutcome
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1     29910      14470                     
## 2     29907      14468  3   1.6833   0.6407
#cv model
relaxed_cv_model = cv.glmnet(train_expl,train_resp,family = 'binomial',relax = TRUE)
relaxed_best_lambda = relaxed_cv_model$lambda.min
#relaxed_best_lambda

#plot cv model
coef(relaxed_cv_model,lambda = relaxed_best_lambda)
## 17 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept) -9.056524739
## age          .          
## job          .          
## marital      .          
## education    0.179056934
## default      .          
## balance      0.095585138
## housing     -1.080342743
## loan        -0.655729480
## contact     -0.520875956
## day          .          
## month        .          
## duration     1.654767784
## campaign    -0.110197839
## pdays        0.002054007
## previous     0.073411054
## poutcome     .
plot(relaxed_cv_model,lambda = relaxed_best_lambda)

#head(train_expl)

#use variables from cv
#more complex uses different cut-off 
#creating a less "accurate model" but decreases the FNR as we want to determine who might default on a loan.
rlasso_pred = predict(relaxed_cv_model,test_expl,type = 'response',s = relaxed_best_lambda)

plot(rlasso_pred,col = test_resp,
     main = 'Log Regression Predictions: Relaxed CV LASSO',
     ylab = 'Prediction',
     xlab = 'Data Index')

#use .1 for cut-offf
new_pred<-factor(ifelse(as.numeric(rlasso_pred)>.1,'yes','no') ,levels = c('no','yes'))
confusionMatrix(new_pred,test_resp)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  4654  130
##        yes 1853  850
##                                          
##                Accuracy : 0.7351         
##                  95% CI : (0.725, 0.7451)
##     No Information Rate : 0.8691         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.3335         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.7152         
##             Specificity : 0.8673         
##          Pos Pred Value : 0.9728         
##          Neg Pred Value : 0.3145         
##              Prevalence : 0.8691         
##          Detection Rate : 0.6216         
##    Detection Prevalence : 0.6390         
##       Balanced Accuracy : 0.7913         
##                                          
##        'Positive' Class : no             
## 
log.roc<-roc(response=test_resp,predictor=rlasso_pred,levels=c("no","yes"))

plot(log.roc,print.thres="best")
auc = auc(log.roc)
text(x = .4, y = .4,paste("AUC = ", round(auc,3), sep = ""))

anova(slr_fsr,relaxed_cv_model, test = "Chisq",class(relaxed_cv_model))
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: y
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                      29949      22686              
## job       11    534.0     29938      22152 < 2.2e-16 ***
## marital    2     80.1     29936      22072 < 2.2e-16 ***
## education  3     58.3     29933      22014 1.333e-12 ***
## balance    1    157.4     29932      21856 < 2.2e-16 ***
## housing    1    445.7     29931      21410 < 2.2e-16 ***
## loan       1     83.9     29930      21326 < 2.2e-16 ***
## contact    2    529.5     29928      20797 < 2.2e-16 ***
## day        1     31.9     29927      20765 1.607e-08 ***
## month     11    926.0     29916      19839 < 2.2e-16 ***
## duration   1   4361.0     29915      15478 < 2.2e-16 ***
## campaign   1     56.9     29914      15421 4.651e-14 ***
## pdays      1     74.7     29913      15346 < 2.2e-16 ***
## poutcome   3    876.9     29910      14470 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1